from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import HTTPError
import re

class BaiduCrawler(object):
    '''
    Baidu Baike crawler: recursively follows internal links
    (hrefs starting with "/wiki/") from a base URL.
    '''
    def __init__(self, url):
        self.url = url      # base URL that relative paths are appended to
        self.pages = set()  # relative paths already seen, to avoid revisiting

    def getLink(self, pageUrl):
        '''Fetch self.url + pageUrl and recursively crawl unseen "/wiki/..." links.'''
        try:
            html = urlopen(self.url + pageUrl)
        except HTTPError as e:
            print(e)
            return
        bsObj = BeautifulSoup(html, "html.parser")
        # Only follow internal links whose href starts with "/wiki/"
        for link in bsObj.find_all("a", href=re.compile("^(/wiki/)")):
            if "href" in link.attrs and link.attrs["href"] not in self.pages:
                # New page: record it, print the full URL, and crawl it recursively
                newPage = link.attrs["href"]
                print(self.url + newPage)
                self.pages.add(newPage)
                self.getLink(newPage)
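

# Usage sketch. The base URL and start path below are placeholder assumptions,
# not part of the original script; getLink only follows hrefs that start with
# "/wiki/", so point it at a site whose internal links use that prefix.
if __name__ == "__main__":
    crawler = BaiduCrawler("https://example.org")
    crawler.getLink("/wiki/Start_page")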
